################################################
# Analysis for:
#
# Warner, Zach, Garrett N. Vande Kamp, and Soren Jordan.
# ``Conditional Relationships in Dynamic Models.'' 
#  Political Science Research & Methods
#
# Morgan and Kelly Supplemental Analysis (Section 7.2)
#
# Required files: sm-mk-jop.dta
#                 interp_urdf.R (for interpretation)
#
################################################

# Set your working directory here

library(dplyr)
library(ggplot2)
library(grid)
library(haven)
library(lmtest)
library(dynamac)
library(urca)
library(tseries)
library(polywog)
library(MASS)
library(plyr)
library(caret)
library(foreign)

completeFun <- function(data, desiredCols) {
  completeVec <- complete.cases(data[, desiredCols])
  return(data[completeVec, ])
}

cl <- function(dat,fm, cluster){
  library(sandwich, quietly = TRUE)
  library(lmtest, quietly = TRUE)
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) 
}

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots==1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

r2.corr <- function(m) {
  lmfit <-  lm(model.response(model.frame(m)) ~ fitted(m))
  summary(lmfit)$r.squared
} 

# ur.df output is annoying. interp_urdf.R helper function for interpretation
#  Author: Hank Roark. 
#  https://gist.github.com/hankroark/968fc28b767f1e43b5a33b151b771bf9
source("interp_urdf.R")

##############################################################################################################################
# SM replication: Morgan and Kelly
##############################################################################################################################
##### M+K REPLICATION #####
repdata <- read.dta("sm-mk-jop.dta")

### create lags
# DVs
for(i in 3:19){
  repdata[,i] <- as.numeric(repdata[,i])
}
for(i in colnames(repdata)[c(3:19)]){
  eval(parse(text=paste("repdata <- ddply(repdata, .(idn), transform, l.",i," = c(NA, ",i,"[-length(",i,")]))", sep="")))
  eval(parse(text=paste("repdata <- ddply(repdata, .(idn), transform, d.",i," = c(NA, diff(",i,",lag=1)))", sep="")))
}

### create their subset to be used in the models
repdata.nona <- completeFun(repdata,c("l.gini_gross", "d.lpbp_cum15", "l.lpbp_cum15", "d.csedhlth_cent", "l.csedhlth_cent",
                                      "d.csssw_cent", "l.csssw_cent", "d.rgdpch_cent", "l.rgdpch_cent", "d.inflationcpimfbr",
                                      "l.inflationcpimfbr", "d.unemplywdi", "l.unemplywdi", "d.inwardstockgdp", "l.inwardstockgdp", 
                                      "d.demrss01_cum45", "l.demrss01_cum45", "d.repressive_cum15", "l.repressive_cum15", "d.edyears",
                                      "l.edyears", "d.pop014wdi", "l.pop014wdi", "l.ethdiv"))

######### Table SM. 4; ``M&K model'' model #########
repmod3 <- lm(d.gini_gross ~ l.gini_gross + d.lpbp_cum15 + d.csedhlth_cent + l.csedhlth_cent + d.rgdpch_cent +
                d.rgdpch_cent*l.csedhlth_cent + d.inflationcpimfbr + l.inflationcpimfbr + d.unemplywdi + d.inwardstockgdp + 
                l.inwardstockgdp + d.edyears, data=repdata.nona)
# Cluster SEs, Model 3
cl(repdata.nona,repmod3,repdata.nona$country)
BIC(repmod3)
r2.corr(repmod3)

######### Table SM. 4; `General'' model #########
ecmfull <- lm(d.gini_gross ~ l.gini_gross + d.lpbp_cum15 + d.csedhlth_cent + l.csedhlth_cent + d.rgdpch_cent + l.rgdpch_cent +
                d.rgdpch_cent*l.csedhlth_cent + l.rgdpch_cent*d.csedhlth_cent + d.rgdpch_cent*d.csedhlth_cent + 
                l.rgdpch_cent*l.csedhlth_cent + d.inflationcpimfbr + l.inflationcpimfbr + d.unemplywdi + d.inwardstockgdp + 
                l.inwardstockgdp + d.edyears, data=repdata.nona)
# Cluster SEs, Model 3
cl(repdata.nona,ecmfull,repdata.nona$country)
BIC(ecmfull)
r2.corr(ecmfull)

# out of sample prediction
tc <- trainControl("cv",5)
train(d.gini_gross ~ l.gini_gross + d.lpbp_cum15 + d.csedhlth_cent + l.csedhlth_cent + d.rgdpch_cent +
        d.rgdpch_cent*l.csedhlth_cent + d.inflationcpimfbr + l.inflationcpimfbr + d.unemplywdi + d.inwardstockgdp + 
        l.inwardstockgdp + d.edyears, data=repdata.nona,
      method="glm",trControl=tc)$results$RMSE
train(d.gini_gross ~ l.gini_gross + d.lpbp_cum15 + d.csedhlth_cent + l.csedhlth_cent + d.rgdpch_cent + l.rgdpch_cent +
        d.rgdpch_cent*l.csedhlth_cent + l.rgdpch_cent*d.csedhlth_cent + d.rgdpch_cent*d.csedhlth_cent + 
        l.rgdpch_cent*l.csedhlth_cent + d.inflationcpimfbr + l.inflationcpimfbr + d.unemplywdi + d.inwardstockgdp + 
        l.inwardstockgdp + d.edyears, data=repdata.nona,
      method="glm",trControl=tc)$results$RMSE

#######################################################################
##    M & K's key finding is that growth increases inequality        ##
##    when investment is low, but decreases it when                  ##
##    investment is high. We can study this by holding               ##
##    *change* in investment at its mean, but varying the            ##
##    *level* of investment, and studying growth's effects.          ##
#######################################################################

pred_effects_d.spending <- function(d.spending){
  thetas <- coef(ecmfull)
  varcov <- vcov(ecmfull)
  theta.tilde <- data.frame(mvrnorm(5000,thetas,varcov))
  theta.tilde <- theta.tilde[,c(2,6,7,4,5,14:17)] # all relevant variables. This order treats GDP as x and spending as z
  colnames(theta.tilde) <- c("gamma_1","theta_0","theta_1","theta_2","theta_3","theta_4","theta_5",
                             "theta_6","theta_7") # notation of Eq 4 in my paper
  # create relevant comparisons for human capital spending
  spending <- seq(unname(summary(repdata.nona$csedhlth_cent)[1]),
                  unname(summary(repdata.nona$csedhlth_cent)[6]),length.out=250) # vary level
  # create hypothetical values based on these
  hyp_zt <- spending
  hyp_zt_l1 <- spending - d.spending
  hyp_zt_p1 <- spending + d.spending
  # create list to loop over
  hyplist <- as.list(NULL)
  # create storage for results
  effects <- data.frame(matrix(ncol=7,nrow=length(spending)))
  colnames(effects) <- c("spending","inst.lo","inst.est","inst.hi","total.lo","total.est","total.hi")
  for(i in 1:length(spending)){
    ### create temp storage
    hyplist[[i]] <- theta.tilde
    
    ### create hypotheticals
    # lagged level
    hyplist[[i]]$theta_2_hyp_zt_l1 <- hyplist[[i]]$theta_2*hyp_zt_l1[i]
    hyplist[[i]]$theta_3_hyp_zt_l1 <- hyplist[[i]]$theta_3*hyp_zt_l1[i]
    hyplist[[i]]$theta_4_hyp_zt_l1 <- hyplist[[i]]$theta_4*hyp_zt_l1[i]
    hyplist[[i]]$theta_5_hyp_zt_l1 <- hyplist[[i]]$theta_5*hyp_zt_l1[i]
    hyplist[[i]]$theta_6_hyp_zt_l1 <- hyplist[[i]]$theta_6*hyp_zt_l1[i]
    hyplist[[i]]$theta_7_hyp_zt_l1 <- hyplist[[i]]$theta_7*hyp_zt_l1[i]
    # current change
    hyplist[[i]]$theta_2_hyp_d <- hyplist[[i]]$theta_2*d.spending
    hyplist[[i]]$theta_3_hyp_d <- hyplist[[i]]$theta_3*d.spending
    hyplist[[i]]$theta_4_hyp_d <- hyplist[[i]]$theta_4*d.spending
    hyplist[[i]]$theta_5_hyp_d <- hyplist[[i]]$theta_5*d.spending
    hyplist[[i]]$theta_6_hyp_d <- hyplist[[i]]$theta_6*d.spending
    hyplist[[i]]$theta_7_hyp_d <- hyplist[[i]]$theta_7*d.spending
    # current level
    hyplist[[i]]$theta_2_hyp_zt <- hyplist[[i]]$theta_2*hyp_zt[i]
    hyplist[[i]]$theta_3_hyp_zt <- hyplist[[i]]$theta_3*hyp_zt[i]
    hyplist[[i]]$theta_4_hyp_zt <- hyplist[[i]]$theta_4*hyp_zt[i]
    hyplist[[i]]$theta_5_hyp_zt <- hyplist[[i]]$theta_5*hyp_zt[i]
    hyplist[[i]]$theta_6_hyp_zt <- hyplist[[i]]$theta_6*hyp_zt[i]
    hyplist[[i]]$theta_7_hyp_zt <- hyplist[[i]]$theta_7*hyp_zt[i]
    # future level
    hyplist[[i]]$theta_2_hyp_p1 <- hyplist[[i]]$theta_2*hyp_zt_p1[i]
    hyplist[[i]]$theta_3_hyp_p1 <- hyplist[[i]]$theta_3*hyp_zt_p1[i]
    hyplist[[i]]$theta_4_hyp_p1 <- hyplist[[i]]$theta_4*hyp_zt_p1[i]
    hyplist[[i]]$theta_5_hyp_p1 <- hyplist[[i]]$theta_5*hyp_zt_p1[i]
    hyplist[[i]]$theta_6_hyp_p1 <- hyplist[[i]]$theta_6*hyp_zt_p1[i]
    hyplist[[i]]$theta_7_hyp_p1 <- hyplist[[i]]$theta_7*hyp_zt_p1[i]
    # future change
    hyplist[[i]]$theta_2_hyp_dfut <- hyplist[[i]]$theta_2*d.spending
    hyplist[[i]]$theta_3_hyp_dfut <- hyplist[[i]]$theta_3*d.spending
    hyplist[[i]]$theta_4_hyp_dfut <- hyplist[[i]]$theta_4*d.spending
    hyplist[[i]]$theta_5_hyp_dfut <- hyplist[[i]]$theta_5*d.spending
    hyplist[[i]]$theta_6_hyp_dfut <- hyplist[[i]]$theta_6*d.spending
    hyplist[[i]]$theta_7_hyp_dfut <- hyplist[[i]]$theta_7*d.spending
    # negative future level
    hyplist[[i]]$theta_2_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_2*hyp_zt_p1[i])
    hyplist[[i]]$theta_3_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_3*hyp_zt_p1[i])
    hyplist[[i]]$theta_4_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_4*hyp_zt_p1[i])
    hyplist[[i]]$theta_5_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_5*hyp_zt_p1[i])
    hyplist[[i]]$theta_6_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_6*hyp_zt_p1[i])
    hyplist[[i]]$theta_7_hyp_p1_neg <- (-1)*(hyplist[[i]]$theta_7*hyp_zt_p1[i])
    # negative future change
    hyplist[[i]]$theta_2_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_2*d.spending)
    hyplist[[i]]$theta_3_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_3*d.spending)
    hyplist[[i]]$theta_4_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_4*d.spending)
    hyplist[[i]]$theta_5_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_5*d.spending)
    hyplist[[i]]$theta_6_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_6*d.spending)
    hyplist[[i]]$theta_7_hyp_dfut_neg <- (-1)*(hyplist[[i]]$theta_7*d.spending)
    
    ### calculate quantities of interest
    hyplist[[i]]$inst.effect <- rowSums(hyplist[[i]][,c("theta_0", "theta_4_hyp_zt","theta_5_hyp_d","theta_6_hyp_d")])
    hyplist[[i]]$total.effect <- (rowSums(hyplist[[i]][,c("theta_1", "theta_7_hyp_zt","theta_5_hyp_d","theta_6_hyp_d",
                                                          "theta_4_hyp_dfut_neg","theta_6_hyp_dfut_neg")]))/((-1)*(hyplist[[i]]$gamma_1))
    effects$spending[i] <- hyp_zt[i]
    effects$inst.lo[i] <- unname(quantile(hyplist[[i]]$inst.effect, .025))
    effects$inst.est[i] <- unname(summary(hyplist[[i]]$inst.effect)[4])
    effects$inst.hi[i] <- unname(quantile(hyplist[[i]]$inst.effect, .975))
    effects$total.lo[i] <- unname(quantile(hyplist[[i]]$total.effect, .025))
    effects$total.est[i] <- unname(summary(hyplist[[i]]$total.effect)[4])
    effects$total.hi[i] <- unname(quantile(hyplist[[i]]$total.effect, .975))
  }
  return(effects)
}

effects <- pred_effects_d.spending(d.spending =  mean(repdata.nona$d.csedhlth_cent))

# plot
apfig9.1 <- ggplot(effects, aes(x=spending,y=inst.est)) + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank(),panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_ribbon(aes(ymin=inst.lo, ymax=inst.hi), alpha=.25) +
  geom_line(aes(x=spending,y=inst.est)) +
  geom_hline(aes(yintercept=0),linetype="dashed",alpha=.7) +
  coord_cartesian(ylim=c(-7, 7)) +
  labs(title="Instantaneous effects",x="Human capital spending",y=expression(paste(Delta, "Market Gini"))) # +
 # theme(text=element_text(family="minpro"))

apfig9.2 <- ggplot(effects, aes(x=spending,y=total.est)) + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank(),panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_ribbon(aes(ymin=total.lo, ymax=total.hi), alpha=.25) +
  geom_line(aes(x=spending,y=total.est)) +
  geom_hline(aes(yintercept=0),linetype="dashed",alpha=.7) +
  coord_cartesian(ylim=c(-7, 7)) +
  labs(title="Total effects",x="Human capital spending",y=expression(paste(Delta, "Market GINI, cumulative")))# +
  # theme(text=element_text(family="minpro"))


######### Figure SM. 18 #########
pdf("sm-mk-hcs.pdf",width=8,height=4)
multiplot(apfig9.1, apfig9.2, cols=2)
dev.off()

# let's look at temporal effects where spending is held at mean
temp_effects_d.spending <- function(d.spending){
  thetas <- coef(ecmfull)
  varcov <- vcov(ecmfull)
  theta.tilde <- data.frame(mvrnorm(5000,thetas,varcov))
  theta.tilde <- theta.tilde[,c(2,6,7,4,5,14:17)] # all relevant variables. This order treats GDP as x and spending as z
  colnames(theta.tilde) <- c("gamma_1","theta_0","theta_1","theta_2","theta_3","theta_4","theta_5",
                             "theta_6","theta_7") # notation of Eq 4 in my paper
  # create relevant comparisons for human capital spending
  spending <- mean(repdata.nona$csedhlth_cent, na.rm = T) # hold at mean
  # create hypothetical values based on these
  hyp_zt <- spending
  hyp_zt_l1 <- spending - d.spending
  hyp_zt_p1 <- spending + d.spending
  # create storage for results
  temp.effects <- data.frame(matrix(nrow=nrow(theta.tilde),ncol=21))
  
  ### create hypotheticals
  # lagged level
  theta.tilde$theta_2_hyp_zt_l1 <- theta.tilde$theta_2*hyp_zt_l1
  theta.tilde$theta_3_hyp_zt_l1 <- theta.tilde$theta_3*hyp_zt_l1
  theta.tilde$theta_4_hyp_zt_l1 <- theta.tilde$theta_4*hyp_zt_l1
  theta.tilde$theta_5_hyp_zt_l1 <- theta.tilde$theta_5*hyp_zt_l1
  theta.tilde$theta_6_hyp_zt_l1 <- theta.tilde$theta_6*hyp_zt_l1
  theta.tilde$theta_7_hyp_zt_l1 <- theta.tilde$theta_7*hyp_zt_l1
  # current change
  theta.tilde$theta_2_hyp_d <- theta.tilde$theta_2*d.spending
  theta.tilde$theta_3_hyp_d <- theta.tilde$theta_3*d.spending
  theta.tilde$theta_4_hyp_d <- theta.tilde$theta_4*d.spending
  theta.tilde$theta_5_hyp_d <- theta.tilde$theta_5*d.spending
  theta.tilde$theta_6_hyp_d <- theta.tilde$theta_6*d.spending
  theta.tilde$theta_7_hyp_d <- theta.tilde$theta_7*d.spending
  # current level
  theta.tilde$theta_2_hyp_zt <- theta.tilde$theta_2*hyp_zt
  theta.tilde$theta_3_hyp_zt <- theta.tilde$theta_3*hyp_zt
  theta.tilde$theta_4_hyp_zt <- theta.tilde$theta_4*hyp_zt
  theta.tilde$theta_5_hyp_zt <- theta.tilde$theta_5*hyp_zt
  theta.tilde$theta_6_hyp_zt <- theta.tilde$theta_6*hyp_zt
  theta.tilde$theta_7_hyp_zt <- theta.tilde$theta_7*hyp_zt
  # future level
  theta.tilde$theta_2_hyp_zt_p1 <- theta.tilde$theta_2*hyp_zt_p1
  theta.tilde$theta_3_hyp_zt_p1 <- theta.tilde$theta_3*hyp_zt_p1
  theta.tilde$theta_4_hyp_zt_p1 <- theta.tilde$theta_4*hyp_zt_p1
  theta.tilde$theta_5_hyp_zt_p1 <- theta.tilde$theta_5*hyp_zt_p1
  theta.tilde$theta_6_hyp_zt_p1 <- theta.tilde$theta_6*hyp_zt_p1
  theta.tilde$theta_7_hyp_zt_p1 <- theta.tilde$theta_7*hyp_zt_p1
  # future change
  theta.tilde$theta_2_hyp_dfut <- theta.tilde$theta_2*d.spending
  theta.tilde$theta_3_hyp_dfut <- theta.tilde$theta_3*d.spending
  theta.tilde$theta_4_hyp_dfut <- theta.tilde$theta_4*d.spending
  theta.tilde$theta_5_hyp_dfut <- theta.tilde$theta_5*d.spending
  theta.tilde$theta_6_hyp_dfut <- theta.tilde$theta_6*d.spending
  theta.tilde$theta_7_hyp_dfut <- theta.tilde$theta_7*d.spending
  
  ### calculate quantities of interest
  for(k in 1:21){
    for(j in 1:nrow(theta.tilde)){
      # subtracting 1 off of the exponents so that it loops into the right place, e.g. the instantaneous effect in row 1, column 1
      temp.effects[j,k] <- (
        (theta.tilde$theta_0[j] + theta.tilde$theta_4_hyp_zt[j] + theta.tilde$theta_5_hyp_d[j] + 
           theta.tilde$theta_6_hyp_d[j])*(1-(theta.tilde$gamma_1[j] + 1)^(k+1-1)) +
          (theta.tilde$theta_1[j] - theta.tilde$theta_0[j] - theta.tilde$theta_4_hyp_zt_p1[j] - 
             theta.tilde$theta_6_hyp_dfut[j] + theta.tilde$theta_7_hyp_zt[j])*(1-(theta.tilde$gamma_1[j] + 1)^(k-1))
      )/(-theta.tilde$gamma_1[j])
    } 
  }
  temp.effects.mean <- apply(temp.effects,2,mean)
  temp.effects <-apply(temp.effects,2,quantile,c(.025,.975))
  temp.effects.df <- data.frame(year=0:(ncol(temp.effects)-1),t(rbind(temp.effects,temp.effects.mean)))
  colnames(temp.effects.df) <- c("year","lower","upper","mean")
  return(temp.effects.df)
}

temp.effects.df <- temp_effects_d.spending(d.spending =  mean(repdata.nona$d.csedhlth_cent))

apfig10 <- ggplot(temp.effects.df, aes(x=year,y=mean)) + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank(),panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_ribbon(aes(ymin=lower, ymax=upper), alpha=.25) +
  geom_line(aes(x=year,y=mean)) +
  geom_hline(aes(yintercept=0),linetype="dashed",alpha=.7) +
  coord_cartesian(ylim=c(-2,1),xlim=c(0,20)) +
  labs(title="Cumulative effects",x="Year",y=expression(paste(Delta, "Market GINI, cumulative"))) # +
  #theme(text=element_text(family="minpro"))


######### Figure SM. 19 #########
pdf("sm-mk-cumulative.pdff",width=4,height=4)
plot(apfig10)
dev.off()






##### DIAGNOSING TIME SERIES PROPERTIES #####
######### Table SM. 10 #########

repdata$interaction <- repdata$csedhlth_cent*repdata$rgdpch_cent
repdata <- ddply(repdata, .(country), transform, d_interaction =
                   c(NA,diff(interaction,lag=1)))
intdata <- completeFun(repdata,c("interaction","country","year"))
countries <- unique(intdata$country)
ps.adf <- ps.box <- ps.kpss <- as.vector(NULL)
# test the interaction for nonstationarity
for(i in 1:length(countries)){
  tmp <- intdata[intdata$country==countries[i],]
  print(countries[i])
  print(adf.test(tmp$interaction))
  ps.adf <- c(ps.adf,adf.test(tmp$interaction)$p.value)
  print(Box.test(tmp$interaction))
  ps.box <- c(ps.box,Box.test(tmp$interaction)$p.value)
  print(kpss.test(tmp$interaction))
  ps.kpss <- c(ps.kpss,kpss.test(tmp$interaction)$p.value)
} # looks like a unit root
mean(ps.adf); length(ps.adf[which(ps.adf < .05)])/length(ps.adf)
mean(ps.box); length(ps.box[which(ps.box < .05)])/length(ps.box)
mean(ps.kpss); length(ps.kpss[which(ps.kpss < .05)])/length(ps.kpss)

intdata <- completeFun(repdata,c("d_interaction","country","year"))
countries <- unique(intdata$country)
ps.adf <- ps.box <- ps.kpss <- as.vector(NULL)
# test the first difference for nonstationarity
for(i in 1:length(countries)){
  tmp <- intdata[intdata$country==countries[i],]
  print(countries[i])
  print(adf.test(tmp$d_interaction))
  ps.adf <- c(ps.adf,adf.test(tmp$d_interaction)$p.value)
  print(Box.test(tmp$d_interaction))
  ps.box <- c(ps.box,Box.test(tmp$d_interaction)$p.value)
  print(kpss.test(tmp$d_interaction))
  ps.kpss <- c(ps.kpss,kpss.test(tmp$d_interaction)$p.value)
} # looks much better. should we continue differencing? let's check the ACFs/PACFs
mean(ps.adf); length(ps.adf[which(ps.adf < .05)])/length(ps.adf)
mean(ps.box); length(ps.box[which(ps.box < .05)])/length(ps.box)
mean(ps.kpss); length(ps.kpss[which(ps.kpss < .05)])/length(ps.kpss)




### some visual diagnostics. this will spit out ACFs and PACFs for each country -- lots of new PDFs in your wd
# This is just illustrative; none of the below is reported in the SM

for(i in 1:length(countries)){
  # ACFs
  tmp <- intdata[intdata$country==countries[i],]
  eval(parse(text=paste("tmpdata <- Acf(tmp$d_interaction, lag.max=25, na.action = na.pass, main='",countries[i],"', plot=F)",sep="")))
  tmpdata <- data.frame(lag=tmpdata$lag, acf=tmpdata$acf)
  tmpdata$min <- tmpdata$max <- rep(NA,nrow(tmpdata))
  tmpdata$max[1] <- 1/sqrt(nrow(tmp))
  for(j in 2:nrow(tmpdata)){
    tmpdata$max[j] <- (sqrt((1/nrow(tmp))*(1+(2*sum((tmpdata$max[c(1:tmpdata$lag[j])])^2)))))
  }
  tmpdata$max <- qnorm(.975)*tmpdata$max
  tmpdata$min <- -1*(tmpdata$max)
  filename <- paste("./diagnostic plots/mk-acf-",countries[i],".pdf",sep="")
  # pdf(filename,width=6,height=3.25)
  showtext.begin()
  print(ggplot(tmpdata, aes(lag, acf)) + 
          theme_bw() + geom_ribbon(aes(ymin=min, ymax=max),fill="grey70") +
          geom_lollipop(point.colour="darkblue", colour="darkblue") +
          coord_cartesian(ylim=c(-1,1),xlim=c(0,max(tmpdata$lag))) +
          labs(title=paste("ACF: ",countries[i],sep=""),x="Lag",y="Autocorrelations of the interaction term") +
          geom_hline(aes(yintercept=0)) +
          theme(text=element_text(family="minpro")))
  showtext.end()
  dev.off()
  # PACFs
  eval(parse(text=paste("tmpdata <- Pacf(tmp$d_interaction, lag.max=25, na.action = na.pass, main='",countries[i],"', plot=F)",sep="")))
  tmpdata <- data.frame(lag=tmpdata$lag, pcf=tmpdata$acf)
  filename <- paste("./diagnostic plots/mk-pacf-",countries[i],".pdf",sep="")
  # pdf(filename,width=6,height=3.25)
  showtext.begin()
  print(ggplot(tmpdata, aes(lag, pcf)) + 
          theme_bw() + geom_ribbon(aes(ymin=(qnorm(.975)/sqrt(nrow(tmp))), ymax=(-qnorm(.975)/sqrt(nrow(tmp)))),fill="grey70") +
          geom_lollipop(point.colour="darkblue", colour="darkblue") +
          coord_cartesian(ylim=c(-1,1),xlim=c(0,max(tmpdata$lag))) +
          labs(title=paste("PACF: ",countries[i],sep=""),x="Lag",y="Partial autocorrelations of the interaction term") +
          geom_hline(aes(yintercept=0)) +
          theme(text=element_text(family="minpro")))
  showtext.end()
  dev.off()
} # Looks stationary to me... So our best guess, given the data constraints, is a unit root process. 



######### Table SM. 11 #########
#... and now let's confirm that everything is cointegrated.
# looking at ECM residuals (i.e. 1-step procedure)
ecmfull <- lm(d.gini_gross ~ l.gini_gross + d.lpbp_cum15 + d.csedhlth_cent + l.csedhlth_cent + d.rgdpch_cent + l.rgdpch_cent +
                d.rgdpch_cent*l.csedhlth_cent + l.rgdpch_cent*d.csedhlth_cent + d.rgdpch_cent*d.csedhlth_cent + 
                l.rgdpch_cent*l.csedhlth_cent + d.inflationcpimfbr + l.inflationcpimfbr + d.unemplywdi + d.inwardstockgdp + 
                l.inwardstockgdp + d.edyears, data=repdata.nona)
Box.test(resid(ecmfull),lag=20,type="Ljung-Box") # can't reject null -- this suggests stationary residuals (i.e. cointegration)
adf.test(resid(ecmfull)) # reject null of non-stationarity, so evidence residuals are stationary (i.e., cointegration)
kpss.test(resid(ecmfull)) # do not reject null of no unit root, so evidence residuals are stationary (i.e., cointegration)

# Engle-Granger 2-step test
repdata.eg <- completeFun(repdata, c("gini_gross","lpbp_cum15","csedhlth_cent","rgdpch_cent","interaction",
                                     "inflationcpimfbr","unemplywdi","inwardstockgdp","edyears"))
m1 <- lm(gini_gross ~ lpbp_cum15 + csedhlth_cent + rgdpch_cent + interaction + inflationcpimfbr + unemplywdi + 
           inwardstockgdp + edyears, data = repdata.eg)
residuals <- resid(m1)
res.ADF<-ur.df(residuals,type="none",selectlags="AIC")
summary(res.ADF) # reject null, so stationary, and cointegration.



######### Table SM. 12 #########
# A formal Johansen test
dat <- data.frame(cbind(repdata$gini_gross, repdata$lpbp_cum15, repdata$csedhlth_cent, repdata$rgdpch_cent,
                        repdata$interaction, repdata$inflationcpimfbr, repdata$unemplywdi, repdata$inwardstockgdp, 
                        repdata$edyears))
ecmtest <- ca.jo(dat,type="trace",ecdet="const")
summary(ecmtest) # evidence of ~7 cointegrating vectors. also the same if we use repdata.nona
summary(ca.jo(dat,type="trace",ecdet="trend")) # ~5 cointegrating vectors. either way, still cointegrated

# looks the same with only nona data
repdata.nona$interaction <- repdata.nona$csedhlth_cent*repdata.nona$rgdpch_cent
dat <- data.frame(cbind(repdata.nona$gini_gross, repdata.nona$lpbp_cum15, repdata.nona$csedhlth_cent, repdata.nona$rgdpch_cent,
                        repdata.nona$interaction, repdata.nona$inflationcpimfbr, repdata.nona$unemplywdi, repdata.nona$inwardstockgdp, 
                        repdata.nona$edyears))
ecmtest <- ca.jo(dat,type="trace",ecdet="const")
summary(ecmtest)
summary(ca.jo(dat,type="trace",ecdet="trend"))

# So it looks to me like, combined with the results of the M+K pretesting, we have strong evidence of
# unit roots and cointegration.


